import os
import pandas as pd
from datetime import datetime
import geopy.distance
import matplotlib.pyplot as plt
import pylab as pl
from statsmodels.tsa.seasonal import seasonal_decompose
import pymongo
import plotly.express as px
# Scan DATA_PATH, where each sub-directory holds the data of one user, and
# build a DataFrame with the user's id, the path of its directory and a 0/1
# flag telling whether that directory contains a "labels.txt" file.
DATA_PATH = "../data"
users = []
# os.listdir returns entries in arbitrary order, so sort for a stable table.
for user in sorted(os.listdir(DATA_PATH)):
    user_path = DATA_PATH + "/" + user
    # Skip stray entries (hidden files, checkpoints, ...): int(user) would
    # raise on a non-numeric name and os.listdir would raise on a plain file.
    if not (user.isdecimal() and os.path.isdir(user_path)):
        continue
    has_label = "labels.txt" in os.listdir(user_path)
    users.append([int(user), user_path, int(has_label)])
df_users = pd.DataFrame(users, columns=["user_id", "user_path", "has_label"])
df_users
| user_id | user_path | has_label | |
|---|---|---|---|
| 0 | 0 | ../data/000 | 0 |
| 1 | 1 | ../data/001 | 0 |
| 2 | 2 | ../data/002 | 0 |
| 3 | 3 | ../data/003 | 0 |
| 4 | 4 | ../data/004 | 0 |
| ... | ... | ... | ... |
| 177 | 177 | ../data/177 | 0 |
| 178 | 178 | ../data/178 | 0 |
| 179 | 179 | ../data/179 | 1 |
| 180 | 180 | ../data/180 | 0 |
| 181 | 181 | ../data/181 | 0 |
182 rows × 3 columns
# First draft of a trajectory DataFrame, built from one sample .plt file.
headers = ["Latitude", "Longitude", "Altitude", "Date", "Horaire"]
with open("../data/010/Trajectory/20070804033032.plt") as f:
    raw_lines = f.read().split("\n")
# The six first lines are skipped; every remaining line is split on commas.
trajectory_test = [line.split(",") for line in raw_lines[6:]]
# Columns 2 and 4 are discarded, then incomplete rows are removed before the
# five remaining columns are named.
df_test = pd.DataFrame(trajectory_test).drop(columns=[2, 4]).dropna()
df_test.columns = headers
print(df_test)
Latitude Longitude Altitude Date Horaire 0 39.921712 116.472343 13 2007-08-04 03:30:32 1 39.921705 116.472343 13 2007-08-04 03:30:33 2 39.921695 116.472345 13 2007-08-04 03:30:34 3 39.921683 116.472342 13 2007-08-04 03:30:35 4 39.921672 116.472342 13 2007-08-04 03:30:36 ... ... ... ... ... ... 1111 39.902912 116.421455 180 2007-08-04 04:14:32 1112 39.902908 116.421432 180 2007-08-04 04:14:33 1113 39.902903 116.421413 180 2007-08-04 04:14:35 1114 39.902892 116.42133 180 2007-08-04 04:14:45 1115 39.902885 116.4213 180 2007-08-04 04:14:46 [1116 rows x 5 columns]
def date_depart(file):
    """Return the start date of a trip.

    The file is read entirely, the line at index 6 (the seventh line) is
    taken as the first record, its newline is removed and its sixth
    comma-separated field is returned as a string.
    """
    with open(file) as handle:
        first_record = handle.readlines()[6]
    fields = first_record.replace("\n", "").split(",")
    return fields[5]
# Sanity check of date_depart on one sample trajectory file.
date_depart("../data/010/Trajectory/20070804033032.plt")
'2007-08-04'
def coord_depart(file):
    """Return the coordinates of the starting point of a trip.

    The line at index 6 of the file is taken as the first record; its two
    first comma-separated fields are returned, as strings, in a
    ``[latitude, longitude]`` list.
    """
    with open(file) as handle:
        first_record = handle.readlines()[6]
    fields = first_record.replace("\n", "").split(",")
    latitude, longitude = fields[0], fields[1]
    return [latitude, longitude]
def horaire_depart(file):
    """Return the timestamp of the starting point of a trip.

    The sixth and seventh comma-separated fields of the line at index 6 are
    joined with a "/" and returned as a single string.
    """
    with open(file) as handle:
        first_record = handle.readlines()[6]
    fields = first_record.replace("\n", "").split(",")
    return "/".join((fields[5], fields[6]))
def coord_arrive(file):
    """Return the coordinates of the arrival point of a trip.

    Args:
        file: Path of a comma-separated trajectory file.

    Returns:
        ``[latitude, longitude]``: the two first fields of the last
        non-blank line of the file, kept as strings.

    Raises:
        IndexError: If the file has no non-blank line, or if its last
            non-blank line has fewer than two fields.
    """
    with open(file, 'r') as f:
        # Ignore blank lines so that an empty line at the end of the file is
        # not mistaken for the last record.
        records = [line for line in f if line.strip()]
    fields = records[-1].replace("\n", "").split(",")
    return [fields[0], fields[1]]
def horaire_arrive(file):
    """Return the timestamp of the arrival point of a trip.

    Args:
        file: Path of a comma-separated trajectory file.

    Returns:
        The sixth and seventh fields of the last non-blank line of the file,
        joined with a "/" into a single string.

    Raises:
        IndexError: If the file has no non-blank line, or if its last
            non-blank line has fewer than seven fields.
    """
    with open(file, 'r') as f:
        # Ignore blank lines so that an empty line at the end of the file is
        # not mistaken for the last record.
        records = [line for line in f if line.strip()]
    fields = records[-1].replace("\n", "").split(",")
    return fields[5] + "/" + fields[6]
def temps(tps_depart, tps_arrive):
    """Return the duration of a trip.

    Both arguments are strings in the "%Y-%m-%d/%H:%M:%S" layout; the result
    is the ``timedelta`` obtained by subtracting the departure from the
    arrival.
    """
    format_data = "%Y-%m-%d/%H:%M:%S"
    depart, arrive = (
        datetime.strptime(stamp, format_data)
        for stamp in (tps_depart, tps_arrive)
    )
    return arrive - depart
def distance(file):
    """Return the length of a trip, in kilometres.

    The six first lines of the file are skipped. Every following non-blank
    line is a comma-separated record whose two first fields are passed to
    geopy as the (latitude, longitude) of a point. The result is the sum of
    the geodesic distances between each pair of consecutive records.

    Args:
        file: Path of the trajectory file.

    Returns:
        The cumulated distance in km; 0 when the file holds fewer than two
        records.
    """
    with open(file, 'r') as f:
        records = [
            line.split(",") for line in f.readlines()[6:] if line.strip()
        ]
    total_km = 0
    # Pair every record with its successor. The previous index loop stopped
    # at len(lines) - 2 and therefore never counted the last segment.
    for start, end in zip(records, records[1:]):
        total_km += geopy.distance.geodesic(
            (start[0], start[1]), (end[0], end[1])
        ).km
    return total_km
# Quick test on the first 40 trajectory files of user 010.
# os.listdir returns names in arbitrary order; sorting makes "the first 40"
# deterministic.
buffer_file = sorted(os.listdir("../data/010/Trajectory/"))
tab_file_u40 = buffer_file[:40]
tab_file_u40
['20070804033032.plt', '20070804155303.plt', '20070805070503.plt', '20070828171302.plt', '20070830203928.plt', '20070901022340.plt', '20070903095208.plt', '20070905163053.plt', '20070906204521.plt', '20070907075003.plt', '20070908081710.plt', '20070910074631.plt', '20070910204430.plt', '20070919121546.plt', '20070919122147.plt', '20070920074804.plt', '20070921120306.plt', '20070922145432.plt', '20070923144426.plt', '20071014161216.plt', '20071017220238.plt', '20071019055858.plt', '20071020030002.plt', '20071021110759.plt', '20071023200800.plt', '20071024084732.plt', '20071025110456.plt', '20071026213100.plt', '20071026214019.plt', '20071029075850.plt', '20071117170827.plt', '20071214175547.plt', '20071228075610.plt', '20071228190036.plt', '20071229083859.plt', '20071230073845.plt', '20071231060928.plt', '20071231130809.plt', '20071231170243.plt', '20080328144824.plt']
%%time
# Build one summary row per trajectory file listed in buffer_file.
headers2 = ["USER_ID","TRAJET_ID","DATE", "DEPART","ARRIVE","DISTANCE", "TEMPS", "TYPE-TRANSPORT"]
final_tab2 = []
for file in buffer_file:
    buff_file = "../data/010/Trajectory/" + file
    # Trip duration: timestamp of the last record minus timestamp of the first.
    buff_temps = temps(horaire_depart(buff_file), horaire_arrive(buff_file))
    # Every row is tagged with the same, fixed transport type.
    transport = 'velo'
    # NOTE(review): USER_ID is stored as '10' while the directory is named
    # '010' - confirm which spelling downstream code expects.
    final_tab2.append([
        '10',
        file,
        date_depart(buff_file),
        coord_depart(buff_file),
        coord_arrive(buff_file),
        distance(buff_file),
        buff_temps,
        transport,
    ])
df_test2 = pd.DataFrame(final_tab2)
df_test2.columns = headers2
# Order the rows by the DATE column.
df_test2 = df_test2.sort_values('DATE')
df_test2
CPU times: total: 1min 26s Wall time: 2min 7s
| USER_ID | TRAJET_ID | DATE | DEPART | ARRIVE | DISTANCE | TEMPS | TYPE-TRANSPORT | |
|---|---|---|---|---|---|---|---|---|
| 0 | 10 | 20070804033032.plt | 2007-08-04 | [39.921712, 116.472343] | [39.902885, 116.4213] | 7.579780 | 0 days 00:44:14 | velo |
| 1 | 10 | 20070804155303.plt | 2007-08-04 | [42.017857, 123.506235] | [42.258245, 123.790855] | 36.828315 | 0 days 00:19:49 | velo |
| 2 | 10 | 20070805070503.plt | 2007-08-05 | [44.589055, 129.603843] | [44.180203, 125.49309] | 521.588151 | 0 days 10:08:07 | velo |
| 3 | 10 | 20070828171302.plt | 2007-08-28 | [39.900917, 116.420018] | [39.118588, 117.24275] | 141.574358 | 0 days 01:08:41 | velo |
| 4 | 10 | 20070830203928.plt | 2007-08-30 | [39.12299, 117.244615] | [39.135748, 117.219655] | 3.740095 | 0 days 00:17:29 | velo |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 156 | 10 | 20090227210324.plt | 2009-02-27 | [39.991438, 116.329449] | [39.137058, 117.219798] | 136.516925 | 0 days 02:27:11 | velo |
| 157 | 10 | 20090301131323.plt | 2009-03-01 | [39.136643, 117.218026] | [39.991345, 116.32791] | 135.217493 | 0 days 02:01:29 | velo |
| 158 | 10 | 20090307044707.plt | 2009-03-07 | [39.991198, 116.330976] | [39.992358, 116.325195] | 136.453900 | 0 days 05:24:23 | velo |
| 159 | 10 | 20090315093133.plt | 2009-03-15 | [39.994676, 116.326561] | [39.992513, 116.326305] | 41.408222 | 0 days 04:31:43 | velo |
| 160 | 10 | 20090321032156.plt | 2009-03-21 | [39.9921, 116.331613] | [39.136253, 117.21831] | 132.865119 | 0 days 02:12:57 | velo |
161 rows × 8 columns
# Turn the trip DATE into the index of the frame so that date slicing and
# resampling can be used for the statistics below.
df_test2["DATE"] = pd.to_datetime(df_test2["DATE"])
df_test2.set_index("DATE", inplace=True)
df_test2
| USER_ID | TRAJET_ID | DEPART | ARRIVE | DISTANCE | TEMPS | TYPE-TRANSPORT | |
|---|---|---|---|---|---|---|---|
| DATE | |||||||
| 2007-08-04 | 10 | 20070804033032.plt | [39.921712, 116.472343] | [39.902885, 116.4213] | 7.579780 | 0 days 00:44:14 | velo |
| 2007-08-04 | 10 | 20070804155303.plt | [42.017857, 123.506235] | [42.258245, 123.790855] | 36.828315 | 0 days 00:19:49 | velo |
| 2007-08-05 | 10 | 20070805070503.plt | [44.589055, 129.603843] | [44.180203, 125.49309] | 521.588151 | 0 days 10:08:07 | velo |
| 2007-08-28 | 10 | 20070828171302.plt | [39.900917, 116.420018] | [39.118588, 117.24275] | 141.574358 | 0 days 01:08:41 | velo |
| 2007-08-30 | 10 | 20070830203928.plt | [39.12299, 117.244615] | [39.135748, 117.219655] | 3.740095 | 0 days 00:17:29 | velo |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 2009-02-27 | 10 | 20090227210324.plt | [39.991438, 116.329449] | [39.137058, 117.219798] | 136.516925 | 0 days 02:27:11 | velo |
| 2009-03-01 | 10 | 20090301131323.plt | [39.136643, 117.218026] | [39.991345, 116.32791] | 135.217493 | 0 days 02:01:29 | velo |
| 2009-03-07 | 10 | 20090307044707.plt | [39.991198, 116.330976] | [39.992358, 116.325195] | 136.453900 | 0 days 05:24:23 | velo |
| 2009-03-15 | 10 | 20090315093133.plt | [39.994676, 116.326561] | [39.992513, 116.326305] | 41.408222 | 0 days 04:31:43 | velo |
| 2009-03-21 | 10 | 20090321032156.plt | [39.9921, 116.331613] | [39.136253, 117.21831] | 132.865119 | 0 days 02:12:57 | velo |
161 rows × 7 columns
# Plot the distance of every trip of the user, from 2007 to 2009.
df_test2.loc['2007': '2009', 'DISTANCE'].plot()
<AxesSubplot:xlabel='DATE'>
# Plot the trip distances between 2008-09 and 2008-10.
df_test2.loc['2008-09': '2008-10', 'DISTANCE'].plot()
<AxesSubplot:xlabel='DATE'>
# Group the 2008-09 .. 2008-10 distances by week and plot them.
# Resampling rules used in this notebook: 'Y' = year, 'M' = month, 'W' = week, 'D' = day.
# NOTE(review): the original legend also listed 'h' = hour and 'm' = minute;
# pandas documents 'min' for minutes - check the offset-alias table of the
# installed version before using them.
# NOTE(review): no aggregation (.mean(), .sum(), ...) is applied between
# .resample('W') and .plot() here - confirm that this is intended.
df_test2.loc['2008-09': '2008-10', 'DISTANCE'].resample('W').plot()
plt.show()
# Distances of a single week (2008-10-01 .. 2008-10-07); no resampling is
# applied here.
df_test2.loc['2008-10-01': '2008-10-07', 'DISTANCE'].plot()
plt.show()
# Distances of the following week (2008-10-08 .. 2008-10-14); no resampling
# is applied here either.
df_test2.loc['2008-10-08': '2008-10-14', 'DISTANCE'].plot()
plt.show()
# With .mean(): mean distance per week ('W' rule) between 2007 and 2009.
# A '2W' rule would give the mean over two-week periods instead.
df_test2.loc['2007': '2009', 'DISTANCE'].resample('W').mean().plot()
plt.show()
df_test2.loc['2007': '2009', 'DISTANCE'].resample('W').mean()
DATE
2007-08-05 188.665415
2007-08-12 NaN
2007-08-19 NaN
2007-08-26 NaN
2007-09-02 52.694549
...
2009-02-22 NaN
2009-03-01 135.867209
2009-03-08 136.453900
2009-03-15 41.408222
2009-03-22 132.865119
Freq: W-SUN, Name: DISTANCE, Length: 86, dtype: float64
# DISTANCE values from 2007 to 2009 with missing entries removed, plotted
# after grouping by year.
v2 = df_test2.loc['2007': '2009', 'DISTANCE'].dropna(axis=0)
v2.resample('Y').plot()
plt.show()
# Table of the weekly statistics (mean, min and max of DISTANCE); weeks for
# which a statistic is missing are dropped.
weekly_distance = df_test2['DISTANCE'].resample('W')
m = weekly_distance.agg(['mean', 'min', 'max']).dropna(axis=0)
m
| mean | min | max | |
|---|---|---|---|
| DATE | |||
| 2007-08-05 | 188.665415 | 7.579780 | 521.588151 |
| 2007-09-02 | 52.694549 | 3.740095 | 141.574358 |
| 2007-09-09 | 117.010763 | 0.534408 | 152.263180 |
| 2007-09-16 | 141.174913 | 141.151165 | 141.198662 |
| 2007-09-23 | 487.017248 | 0.000000 | 1896.601707 |
| 2007-10-14 | 8.474745 | 8.474745 | 8.474745 |
| 2007-10-21 | 452.405129 | 13.074198 | 903.058800 |
| 2007-10-28 | 117.826150 | 0.953561 | 357.597888 |
| 2007-11-04 | 172.599180 | 172.599180 | 172.599180 |
| 2007-11-18 | 2.386127 | 2.386127 | 2.386127 |
| 2007-12-16 | 0.874360 | 0.874360 | 0.874360 |
| 2007-12-30 | 371.297449 | 7.598410 | 1336.970124 |
| 2008-01-06 | 76.233231 | 2.616714 | 173.872079 |
| 2008-03-30 | 789.402972 | 40.563641 | 1482.945005 |
| 2008-04-06 | 875.530523 | 9.799302 | 1488.715699 |
| 2008-05-18 | 30.094051 | 30.094051 | 30.094051 |
| 2008-05-25 | 158.996456 | 158.996456 | 158.996456 |
| 2008-06-15 | 78.460888 | 1.804055 | 155.117721 |
| 2008-06-22 | 412.071493 | 64.587769 | 1016.879548 |
| 2008-06-29 | 9.678511 | 6.875718 | 12.481304 |
| 2008-08-03 | 1480.011872 | 1480.011872 | 1480.011872 |
| 2008-09-21 | 78.940948 | 12.630421 | 134.496513 |
| 2008-09-28 | 207.047476 | 11.731297 | 1889.460371 |
| 2008-10-05 | 978.507356 | 12.559579 | 2039.850203 |
| 2008-10-12 | 111.524805 | 9.099131 | 137.478298 |
| 2008-10-19 | 80.373230 | 5.465194 | 140.088022 |
| 2008-10-26 | 9.072080 | 8.392449 | 9.751710 |
| 2008-11-02 | 58.790511 | 12.940038 | 133.391211 |
| 2008-11-09 | 58.509264 | 12.889444 | 140.571643 |
| 2008-12-07 | 128.070790 | 8.253144 | 231.666171 |
| 2008-12-14 | 140.366129 | 133.256700 | 147.475559 |
| 2008-12-21 | 2929.076701 | 2929.076701 | 2929.076701 |
| 2008-12-28 | 113.490308 | 0.690497 | 165.621845 |
| 2009-01-04 | 93.817988 | 4.743728 | 143.685910 |
| 2009-01-11 | 149.108992 | 129.426850 | 168.791134 |
| 2009-01-18 | 46.443713 | 23.816281 | 89.971945 |
| 2009-01-25 | 137.989135 | 137.989135 | 137.989135 |
| 2009-02-01 | 129.980518 | 129.980518 | 129.980518 |
| 2009-02-08 | 138.322373 | 133.470686 | 143.174060 |
| 2009-02-15 | 154.286165 | 135.680966 | 172.891365 |
| 2009-03-01 | 135.867209 | 135.217493 | 136.516925 |
| 2009-03-08 | 136.453900 | 136.453900 | 136.453900 |
| 2009-03-15 | 41.408222 | 41.408222 | 41.408222 |
| 2009-03-22 | 132.865119 | 132.865119 | 132.865119 |
# Weekly mean distance, one curve per year (2007, 2008 and 2009).
# Original remark: the focus is on 2008 because it is the year with the most
# values.
plt.figure(figsize=(12,8))
m['mean']['2007'].plot(label= 'moyenne en 2007', lw=2, ls='--', alpha=0.8)
m['mean']['2008'].plot(label= 'moyenne en 2008')
# The 2009 curve used to be labelled 'moyenne en 2008', which gave the legend
# two identical entries.
m['mean']['2009'].plot(label= 'moyenne en 2009', lw=3, ls=':', alpha=0.8)
# Optional weekly min-max band, currently disabled:
#plt.fill_between(m.index, m['max'], m['min'], alpha=0.2, label='min-max par semaine')
plt.legend()
plt.show()
# Overlay the mean distance over 2007-2009 grouped by year, by month and by
# week on the same axes.
distance_07_09 = df_test2.loc['2007': '2009', 'DISTANCE']
for rule, line_style in (
    ('Y', dict(label='moyenne regroupé par année', lw=2, ls='--', alpha=0.8)),
    ('M', dict(label='moyenne regroupé par mois', lw=3, ls=':', alpha=0.8)),
    ('W', dict(label='moyenne regroupé par semaine')),
):
    distance_07_09.resample(rule).mean().plot(**line_style)
plt.legend()
plt.show()
# Independent copy holding only the DISTANCE column, for time-series analysis.
analysis = df_test2[['DISTANCE']].copy()
# Seasonal decomposition, currently disabled.
# NOTE(review): the index displayed by this cell has freq=None and repeated
# dates; seasonal_decompose presumably needs a regular frequency or an
# explicit period - confirm before re-enabling.
#decompose_result_mult = seasonal_decompose(analysis, model="additive")
# trend = decompose_result_mult.trend
# seasonal = decompose_result_mult.seasonal
# residual = decompose_result_mult.resid
#
# decompose_result_mult.plot();
# Display the index of the analysis frame.
analysis.index
DatetimeIndex(['2007-08-04', '2007-08-04', '2007-08-05', '2007-08-28',
'2007-08-30', '2007-09-01', '2007-09-03', '2007-09-05',
'2007-09-06', '2007-09-07',
...
'2009-01-31', '2009-02-07', '2009-02-08', '2009-02-13',
'2009-02-14', '2009-02-27', '2009-03-01', '2009-03-07',
'2009-03-15', '2009-03-21'],
dtype='datetime64[ns]', name='DATE', length=161, freq=None)
# Display the analysis frame.
analysis
# Read the dtypes attribute (type of each column) and print it.
result = analysis.dtypes
print("Output:")
print(result)
Output: DISTANCE float64 dtype: object
# Connect to the MongoDB cluster and open the "DATAGPS" collection of the
# "DonneeGPS" database.
# The connection string carries credentials, so it is read from the
# MONGODB_URI environment variable instead of being hard-coded in the source.
# NOTE(review): the password committed in the previous version of this cell
# should be considered leaked and rotated.
uri = os.environ.get("MONGODB_URI")
if not uri:
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to the MongoDB connection string"
    )
myclient = pymongo.MongoClient(uri)
mydb = myclient["DonneeGPS"]
mycol = mydb["DATAGPS"]
# One query per user of interest (USER_ID '000', '001' and '010').
x = mycol.find({'USER_ID': '000'})
y = mycol.find({'USER_ID': '001'})
z = mycol.find({'USER_ID': '010'})
# Load the query results of each user into its own DataFrame.
df_plot = pd.DataFrame(list(x))      # user 000
df_plot_001 = pd.DataFrame(list(y))  # user 001
df_plot_010 = pd.DataFrame(list(z))  # user 010
df_plot_010
| _id | USER_ID | DATE | DEPART | ARRIVE | DISTANCE | TEMPS | TYPE-TRANSPORT | DOM-TRAV | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 638e32ba429ed8f316063d75 | 010 | 2007-08-04/03:30:32 | (39.921712, 116.472343) | (39.920587, 116.472327) | 0.15591323546084734 | 0:02:26 | marche | UNDEFINED |
| 1 | 638e32ba429ed8f316063d76 | 010 | 2007-08-04/03:32:58 | (39.920612, 116.47233) | (39.919843, 116.472015) | 0.09839271676975873 | 0:00:30 | velo | UNDEFINED |
| 2 | 638e32ba429ed8f316063d77 | 010 | 2007-08-04/03:33:28 | (39.919847, 116.472015) | (39.919595, 116.47183) | 0.041901067721451773 | 0:01:18 | marche | UNDEFINED |
| 3 | 638e32ba429ed8f316063d78 | 010 | 2007-08-04/03:34:46 | (39.91961, 116.471828) | (39.918383, 116.472317) | 0.1493635058754237 | 0:00:33 | velo | UNDEFINED |
| 4 | 638e32ba429ed8f316063d79 | 010 | 2007-08-04/03:35:19 | (39.918387, 116.472333) | (39.916862, 116.471713) | 0.23231533390805081 | 0:02:58 | marche | UNDEFINED |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 12993 | 638e32bc429ed8f316067036 | 010 | 2009-03-21/05:29:50 | (39.129293, 117.21125) | (39.130798, 117.209983) | 0.1946596240649425 | 0:00:22 | voiture/bus/taxi | UNDEFINED |
| 12994 | 638e32bc429ed8f316067037 | 010 | 2009-03-21/05:30:12 | (39.13075, 117.210046) | (39.135231, 117.214251) | 0.7305619825803901 | 0:01:07 | voiture/bus/taxi | UNDEFINED |
| 12995 | 638e32bc429ed8f316067038 | 010 | 2009-03-21/05:31:19 | (39.135206, 117.214216) | (39.13528, 117.214281) | 0.015433977507834774 | 0:00:22 | marche | UNDEFINED |
| 12996 | 638e32bc429ed8f316067039 | 010 | 2009-03-21/05:31:41 | (39.13528, 117.214281) | (39.135571, 117.216216) | 0.17897305062830568 | 0:00:44 | velo | UNDEFINED |
| 12997 | 638e32bc429ed8f31606703a | 010 | 2009-03-21/05:32:25 | (39.135546, 117.216201) | (39.136256, 117.218303) | 0.22277101475818747 | 0:02:28 | marche | UNDEFINED |
12998 rows × 9 columns
# Number of rows per transport type for user 000.
df_plot["TYPE-TRANSPORT"].value_counts()
marche 1175 voiture/bus/taxi 760 velo 692 train 38 Name: TYPE-TRANSPORT, dtype: int64
# Bar chart of the transport types, first for user 000 then for user 001.
for frame in (df_plot, df_plot_001):
    fig = px.bar(frame, x="TYPE-TRANSPORT", title='TYPE-TRANSPORT')
    fig.show()
# Monthly chart: add a "MONTH" column made of the first 7 characters of DATE,
# then draw one group of bars per month, coloured by transport type.
dates = df_plot["DATE"].tolist()
months = [stamp[:7] for stamp in dates]
df_plot["MONTH"] = months
fig = px.bar(
    df_plot,
    x="MONTH",
    color='TYPE-TRANSPORT',
    barmode='group',
    height=400,
)
# Remove the outline drawn around each bar.
fig.update_traces(marker_line_width=0)
fig.show()
# Work on a copy of the user 000 frame and use the trip DATE as its index so
# that date slicing and resampling can be used for the statistics.
df_plot_copy = df_plot.copy()
# DATE is stored as "date/time" text; give the layout explicitly (the same one
# temps() parses) rather than letting pandas guess it.
df_plot_copy.DATE = pd.to_datetime(df_plot_copy.DATE, format="%Y-%m-%d/%H:%M:%S")
df_plot_copy.set_index('DATE', inplace=True)
# Table of the weekly statistics (mean, min and max of DISTANCE).
# DISTANCE is converted to float first; weeks for which a statistic is
# missing are dropped.
df_plot_copy.DISTANCE = df_plot_copy.DISTANCE.astype(float)
m = df_plot_copy['DISTANCE'].resample('W').agg(['mean', 'min', 'max'])
m = m.dropna(axis=0)
m
| mean | min | max | |
|---|---|---|---|
| DATE | |||
| 2008-10-26 | 1.534044 | 0.092277 | 6.134741 |
| 2008-11-02 | 1.727254 | 0.083017 | 7.527443 |
| 2008-11-09 | 3.530503 | 0.424163 | 8.943539 |
| 2008-11-16 | 1.232348 | 0.068799 | 4.386517 |
| 2008-11-23 | 0.825494 | 0.023192 | 4.073307 |
| 2008-12-07 | 0.728412 | 0.003140 | 2.103240 |
| 2008-12-14 | 1.164513 | 0.034702 | 3.284624 |
| 2009-04-05 | 1.127259 | 0.000140 | 10.211117 |
| 2009-04-12 | 1.545150 | 0.000634 | 17.813733 |
| 2009-04-19 | 1.640630 | 0.001691 | 16.519590 |
| 2009-04-26 | 1.300816 | 0.018620 | 9.245312 |
| 2009-05-03 | 1.292563 | 0.012914 | 10.432872 |
| 2009-05-10 | 1.161325 | 0.032873 | 3.948896 |
| 2009-05-17 | 1.045953 | 0.038384 | 3.041003 |
| 2009-05-24 | 1.263743 | 0.008401 | 8.971864 |
| 2009-05-31 | 1.088704 | 0.061086 | 5.466519 |
| 2009-06-07 | 1.245453 | 0.036099 | 8.298692 |
| 2009-06-14 | 1.097408 | 0.002445 | 4.219826 |
| 2009-06-21 | 1.157959 | 0.012138 | 4.387961 |
| 2009-06-28 | 1.316769 | 0.038310 | 10.093630 |
| 2009-07-05 | 1.184727 | 0.000420 | 13.338901 |
# Weekly mean distance, one curve for 2008 and one for 2009.
# Original remark: the focus is on 2008 because it is the year with the most
# values.
plt.figure(figsize=(12,8))
weekly_mean = m['mean']
for year, line_style in (
    ('2008', dict(label='moyenne en 2008', lw=2, ls='--', alpha=0.8)),
    ('2009', dict(label='moyenne en 2009', lw=3, ls=':', alpha=0.8)),
):
    weekly_mean[year].plot(**line_style)
# Optional weekly min-max band, currently disabled:
#plt.fill_between(m.index, m['max'], m['min'], alpha=0.2, label='min-max par semaine')
plt.legend()
plt.show()
# Number of rows of df_plot for each transport type.
transport_type = df_plot['TYPE-TRANSPORT']
buff_marche = transport_type.eq('marche').sum()
buff_train = transport_type.eq('train').sum()
buff_velo = transport_type.eq('velo').sum()
buff_voiture = transport_type.eq('voiture/bus/taxi').sum()
# NOTE(review): 'airplane' is not among the values listed by the value_counts
# output shown earlier for this frame - confirm the label used in the
# database.
buff_avion = transport_type.eq('airplane').sum()
df_plot
| _id | USER_ID | DATE | DEPART | ARRIVE | DISTANCE | TEMPS | TYPE-TRANSPORT | DOM-TRAV | MONTH | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 638e32b6429ed8f31605d088 | 000 | 2008-10-23/02:53:04 | (39.984702, 116.318417) | (39.984045, 116.298725) | 2.0723741546645145 | 0:12:11 | velo | UNDEFINED | 2008-10 |
| 1 | 638e32b6429ed8f31605d089 | 000 | 2008-10-23/02:53:04 | (39.984702, 116.318417) | (39.984045, 116.298725) | 2.0723741546645145 | 0:12:11 | velo | UNDEFINED | 2008-10 |
| 2 | 638e32b6429ed8f31605d08a | 000 | 2008-10-23/04:08:07 | (39.995777, 116.286798) | (39.984498, 116.299407) | 2.0763510121490385 | 0:02:00 | voiture/bus/taxi | UNDEFINED | 2008-10 |
| 3 | 638e32b6429ed8f31605d08b | 000 | 2008-10-23/04:10:07 | (39.984499, 116.299405) | (39.990325, 116.310258) | 1.7731990619742037 | 0:09:55 | velo | UNDEFINED | 2008-10 |
| 4 | 638e32b6429ed8f31605d08c | 000 | 2008-10-23/04:20:02 | (39.990219, 116.310215) | (39.990887, 116.310479) | 0.0922767965163593 | 0:01:35 | marche | UNDEFINED | 2008-10 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2660 | 638e32b6429ed8f31605daec | 000 | 2009-07-05/07:33:15 | (39.984025, 116.306875) | (39.984199, 116.308835) | 0.24069283883115963 | 0:01:14 | velo | UNDEFINED | 2009-07 |
| 2661 | 638e32b6429ed8f31605daed | 000 | 2009-07-05/07:34:29 | (39.984182, 116.308526) | (39.991446, 116.323739) | 2.0499225963154446 | 0:03:25 | voiture/bus/taxi | UNDEFINED | 2009-07 |
| 2662 | 638e32b6429ed8f31605daee | 000 | 2009-07-05/07:37:54 | (39.991416, 116.323173) | (39.996502, 116.328972) | 1.0899619160797875 | 0:03:29 | velo | UNDEFINED | 2009-07 |
| 2663 | 638e32b6429ed8f31605daef | 000 | 2009-07-05/07:41:23 | (39.996488, 116.328608) | (39.999958, 116.327362) | 0.5767519160190993 | 0:01:15 | voiture/bus/taxi | UNDEFINED | 2009-07 |
| 2664 | 638e32b6429ed8f31605daf0 | 000 | 2009-07-05/07:42:38 | (39.999985, 116.327424) | (40.000522, 116.327132) | 0.10156384139126953 | 0:02:37 | marche | UNDEFINED | 2009-07 |
2665 rows × 10 columns
# Group the 2008-10 .. 2008-11 distances by week and plot them.
# Resampling rules used in this notebook: 'Y' = year, 'M' = month, 'W' = week, 'D' = day.
# NOTE(review): the original legend also listed 'h' = hour and 'm' = minute;
# pandas documents 'min' for minutes - check the offset-alias table of the
# installed version before using them.
df_plot_copy.loc['2008-10': '2008-11', 'DISTANCE'].resample('W').plot()
plt.show()
# Bar chart with the DEPART column of df_plot on the x axis.
fig = px.bar(df_plot, x='DEPART')
fig.show()
# Total distance per transport type, sorted in increasing order.
# DISTANCE is selected before summing so that only that column is aggregated;
# summing the whole frame made pandas emit a FutureWarning about the
# numeric_only default.
buff = df_plot_copy.groupby('TYPE-TRANSPORT')['DISTANCE'].sum().sort_values()
fig = px.bar(buff, x="DISTANCE", title='DISTANCE TOTAL PAR TYPE-TRANSPORT (USER 000)')
fig.show()
C:\Program Files\KMSpico\temp\ipykernel_104252\2862209076.py:1: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
# Density map of the departure points of df_plot.
# Each DEPART value is text; the parentheses are removed and the text is
# split on "," to get the latitude and the longitude as floats.
_drop_parentheses = str.maketrans("", "", "()")
_pairs = [
    depart.translate(_drop_parentheses).split(",")
    for depart in df_plot.DEPART.tolist()
]
lat = [float(pair[0]) for pair in _pairs]
long = [float(pair[1]) for pair in _pairs]
# Every point has the same weight of 1.
count = [1] * len(df_plot)
fig = px.density_mapbox(
    df_plot,
    lat=lat,
    lon=long,
    z=count,
    mapbox_style="stamen-terrain",
)
fig
# Comparison of user 000 and user 001.
# Inner join: the two frames are combined on the TYPE-TRANSPORT values they
# have in common; the other columns get a per-user suffix.
# NOTE(review): TYPE-TRANSPORT is repeated many times in each frame, so every
# row of one user is paired with every row of the other user having the same
# transport type (the displayed result has 1,409,312 rows) - confirm this is
# the comparison that is wanted.
fusion = df_plot.merge(
    df_plot_001,
    how='inner',
    on='TYPE-TRANSPORT',
    suffixes=('_user000', '_user001'),
)
fusion
| _id_user000 | USER_ID_user000 | DATE_user000 | DEPART_user000 | ARRIVE_user000 | DISTANCE_user000 | TEMPS_user000 | TYPE-TRANSPORT | DOM-TRAV_user000 | MONTH | _id_user001 | USER_ID_user001 | DATE_user001 | DEPART_user001 | ARRIVE_user001 | DISTANCE_user001 | TEMPS_user001 | DOM-TRAV_user001 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 638e32b6429ed8f31605d088 | 000 | 2008-10-23/02:53:04 | (39.984702, 116.318417) | (39.984045, 116.298725) | 2.0723741546645145 | 0:12:11 | velo | UNDEFINED | 2008-10 | 638e32b6429ed8f31605daf1 | 001 | 2008-10-23/05:53:05 | (39.984094, 116.319236) | (39.978051, 116.327538) | 1.5427186349264368 | 0:08:52 | UNDEFINED |
| 1 | 638e32b6429ed8f31605d088 | 000 | 2008-10-23/02:53:04 | (39.984702, 116.318417) | (39.984045, 116.298725) | 2.0723741546645145 | 0:12:11 | velo | UNDEFINED | 2008-10 | 638e32b6429ed8f31605daf2 | 001 | 2008-10-23/05:53:05 | (39.984094, 116.319236) | (39.978051, 116.327538) | 1.5427186349264368 | 0:08:52 | UNDEFINED |
| 2 | 638e32b6429ed8f31605d088 | 000 | 2008-10-23/02:53:04 | (39.984702, 116.318417) | (39.984045, 116.298725) | 2.0723741546645145 | 0:12:11 | velo | UNDEFINED | 2008-10 | 638e32b6429ed8f31605daf5 | 001 | 2008-10-23/10:35:36 | (39.98028, 116.326988) | (39.983215, 116.326708) | 0.3188556326688519 | 0:00:59 | UNDEFINED |
| 3 | 638e32b6429ed8f31605d088 | 000 | 2008-10-23/02:53:04 | (39.984702, 116.318417) | (39.984045, 116.298725) | 2.0723741546645145 | 0:12:11 | velo | UNDEFINED | 2008-10 | 638e32b6429ed8f31605daf6 | 001 | 2008-10-23/10:36:35 | (39.983115, 116.326693) | (40.009928, 116.314928) | 3.876949395296889 | 0:19:35 | UNDEFINED |
| 4 | 638e32b6429ed8f31605d088 | 000 | 2008-10-23/02:53:04 | (39.984702, 116.318417) | (39.984045, 116.298725) | 2.0723741546645145 | 0:12:11 | velo | UNDEFINED | 2008-10 | 638e32b6429ed8f31605daf8 | 001 | 2008-10-23/10:57:09 | (40.009645, 116.312623) | (40.01615, 116.307288) | 1.076005654344659 | 0:06:04 | UNDEFINED |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1409307 | 638e32b6429ed8f31605d9fd | 000 | 2009-06-28/09:27:07 | (31.109591, 121.077294) | (31.137361, 121.129715) | 5.829406390378319 | 0:03:20 | train | UNDEFINED | 2009-06 | 638e32b6429ed8f31605dc42 | 001 | 2008-11-01/04:16:32 | (40.07093, 116.299532) | (40.060837, 116.295175) | 1.8610431637789926 | 0:01:17 | UNDEFINED |
| 1409308 | 638e32b6429ed8f31605d9fe | 000 | 2009-06-28/09:30:27 | (31.137328, 121.128074) | (31.148903, 121.232573) | 10.093629729871989 | 0:06:40 | train | UNDEFINED | 2009-06 | 638e32b6429ed8f31605dc42 | 001 | 2008-11-01/04:16:32 | (40.07093, 116.299532) | (40.060837, 116.295175) | 1.8610431637789926 | 0:01:17 | UNDEFINED |
| 1409309 | 638e32b6429ed8f31605d9ff | 000 | 2009-06-28/09:37:07 | (31.148914, 121.231212) | (31.150603, 121.25979) | 2.622502561565015 | 0:01:40 | train | UNDEFINED | 2009-06 | 638e32b6429ed8f31605dc42 | 001 | 2008-11-01/04:16:32 | (40.07093, 116.299532) | (40.060837, 116.295175) | 1.8610431637789926 | 0:01:17 | UNDEFINED |
| 1409310 | 638e32b6429ed8f31605da00 | 000 | 2009-06-28/09:38:47 | (31.150392, 121.258572) | (31.154602, 121.284589) | 2.384399904536095 | 0:01:40 | train | UNDEFINED | 2009-06 | 638e32b6429ed8f31605dc42 | 001 | 2008-11-01/04:16:32 | (40.07093, 116.299532) | (40.060837, 116.295175) | 1.8610431637789926 | 0:01:17 | UNDEFINED |
| 1409311 | 638e32b6429ed8f31605da31 | 000 | 2009-06-29/02:12:25 | (31.29325, 121.438485) | (31.209642, 121.373726) | 13.338901129116042 | 0:10:00 | train | UNDEFINED | 2009-06 | 638e32b6429ed8f31605dc42 | 001 | 2008-11-01/04:16:32 | (40.07093, 116.299532) | (40.060837, 116.295175) | 1.8610431637789926 | 0:01:17 | UNDEFINED |
1409312 rows × 18 columns
# Convert both DISTANCE columns from text to float so they can be plotted,
# then plot the distances of user 000.
for column in ("DISTANCE_user000", "DISTANCE_user001"):
    fusion[column] = fusion[column].astype(float)
fusion[["DISTANCE_user000"]].plot(figsize=(12, 8))
<AxesSubplot:>